Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 0 additions & 52 deletions bert/bert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,59 +50,7 @@ def __getitem__(self, idx):
def __len__(self):
    # Dataset size == number of labels (one label per encoded example).
    return len(self.labels)

# Function to compute metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer eval step.

    `eval_pred` is the (predictions, labels) pair the HF Trainer passes in;
    predictions are raw logits, so argmax picks the predicted class.
    """
    # Imported lazily so sklearn is only required when evaluation runs.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, gold = eval_pred
    predicted = logits.argmax(axis=1)

    # 'weighted' averages per-class scores by class support.
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')

    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training arguments for the Trainer
def get_training_args(output_dir='./results'):
    """Build the HF TrainingArguments used to fine-tune the sentiment model.

    Args:
        output_dir: directory where checkpoints are written (default './results').

    Returns:
        transformers.TrainingArguments configured to evaluate and save every
        epoch and to reload the best (highest-accuracy) checkpoint at the end.
    """
    # Imported lazily so transformers is only required when training runs.
    from transformers import TrainingArguments

    return TrainingArguments(
        output_dir=output_dir,
        # Evaluate/save once per epoch so load_best_model_at_end can compare
        # checkpoints on the same schedule.
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        logging_dir='./logs',
        logging_steps=10
    )

# Prediction function to use after training
def predict_sentiment(model, tokenizer, texts):
    """Classify each text in `texts` as 'anti', 'neutral', or 'pro'.

    Runs the model in eval mode on GPU when available, batching all texts
    through the tokenizer in one call.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        # Without labels the model returns bare logits.
        logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

    class_ids = torch.argmax(logits, dim=1).cpu().numpy()
    # Index-to-label mapping covering every possible class id (0, 1, 2).
    sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [sentiment_map[cid] for cid in class_ids]

2 changes: 1 addition & 1 deletion bert/bert_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import torch
from transformers import AutoTokenizer, Trainer
from sklearn.metrics import classification_report
from model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment
from NLP_climate_analysis.models.BERT.model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment

def main():
# Check for CUDA
Expand Down
8,311 changes: 8,311 additions & 0 deletions data/test_data.csv

Large diffs are not rendered by default.

55,466 changes: 27,733 additions & 27,733 deletions train_data.csv → data/train_data.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
56 changes: 56 additions & 0 deletions models/BERT/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig
from torch.nn import CrossEntropyLoss

class ClimateModel(nn.Module):
    """DistilBERT encoder topped with a 3-way linear head (anti/neutral/pro)."""

    def __init__(self):
        super().__init__()
        cfg = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=3)
        self.model = AutoModel.from_pretrained("distilbert-base-uncased", config=cfg)
        # Three output classes: anti, neutral, pro.
        self.classifier = nn.Linear(self.model.config.hidden_size, 3)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run a forward pass.

        Returns (loss, logits) when `labels` is given — the tuple format the
        HF Trainer expects — and bare logits otherwise.
        """
        encoder_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first ([CLS]) token, regularized with dropout,
        # feeds the classification head.
        cls_state = self.dropout(encoder_out.last_hidden_state[:, 0])
        logits = self.classifier(cls_state)

        if labels is None:
            # Evaluation / prediction path: logits only.
            return logits

        # CrossEntropyLoss wants logits [batch, num_labels] vs labels [batch].
        loss = CrossEntropyLoss()(logits.view(-1, 3), labels.view(-1))
        return (loss, logits)

# Dataset class for the climate sentiment data
class ClimateDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with sentiment labels for the HF Trainer."""

    def __init__(self, encodings, labels):
        # `encodings` is a dict of equal-length tensors (input_ids, attention_mask, ...).
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        sample = {name: tensor[idx] for name, tensor in self.encodings.items()}
        # The Trainer looks for the target under the 'labels' key.
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        return len(self.labels)





110 changes: 110 additions & 0 deletions models/BERT/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd
import torch
from transformers import AutoTokenizer, Trainer
from sklearn.metrics import classification_report
from NLP_climate_analysis.models.BERT.model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment

def main():
    """Fine-tune the DistilBERT climate-sentiment model, evaluate it, save it,
    and print example predictions.

    NOTE(review): the module-level import pulls compute_metrics /
    get_training_args / predict_sentiment from ...BERT.model, but this PR
    defines them in models/BERT/utils.py — confirm the import path.
    """
    # Check for CUDA
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    print("Loading data...")
    # FIX: the CSVs were moved into data/ in this change set; the old
    # repo-root paths would raise FileNotFoundError.
    train_df = pd.read_csv('data/train_data.csv')
    test_df = pd.read_csv('data/test_data.csv')

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Ensure all messages are strings and handle potential NaN values
    train_df['message'] = train_df['message'].fillna('').astype(str)
    test_df['message'] = test_df['message'].fillna('').astype(str)

    # Tokenize texts
    print("Tokenizing texts...")
    train_encodings = tokenizer(
        train_df['message'].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        test_df['message'].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )

    # Convert encodings to dataset format
    train_dataset = ClimateDataset(
        {k: v.squeeze() for k, v in train_encodings.items()},
        torch.tensor(train_df['sentiment'].tolist())
    )

    test_dataset = ClimateDataset(
        {k: v.squeeze() for k, v in test_encodings.items()},
        torch.tensor(test_df['sentiment'].tolist())
    )

    # Initialize model
    model = ClimateModel()

    # Get training arguments
    training_args = get_training_args(output_dir='./results')

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    print("Training model...")
    trainer.train()

    # Evaluate model
    print("Evaluating model...")
    evaluation = trainer.evaluate()
    print(f"Evaluation results: {evaluation}")

    # Generate predictions for test data
    print("Generating predictions...")
    test_outputs = trainer.predict(test_dataset)
    predicted_labels = test_outputs.predictions.argmax(-1)
    true_labels = test_df['sentiment'].values

    # Generate classification report
    target_names = ['anti', 'neutral', 'pro']
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, target_names=target_names))

    # Save model
    print("Saving model...")
    trainer.save_model('./saved_model')
    tokenizer.save_pretrained('./saved_model')

    print("Model training and evaluation complete!")

    # Example prediction
    example_texts = [
        "Climate change is a real threat that requires immediate action.",
        "I'm not sure if humans are causing climate change or if it's natural.",
        "Global warming is a hoax created by scientists for grant money."
    ]

    predictions = predict_sentiment(model, tokenizer, example_texts)

    print("\nExample predictions:")
    for text, pred in zip(example_texts, predictions):
        print(f"Text: {text}")
        print(f"Prediction: {pred}\n")

if __name__ == "__main__":
    main()
53 changes: 53 additions & 0 deletions models/BERT/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments

# Function to compute metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer eval step.

    `eval_pred` is the (logits, labels) pair the HF Trainer supplies; the
    predicted class is the argmax over the logits.
    """
    logits, gold = eval_pred
    predicted = logits.argmax(axis=1)
    # 'weighted' averages per-class scores by class support.
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training arguments for the Trainer
def get_training_args(output_dir='./results'):
    """Build the HF TrainingArguments used to fine-tune the sentiment model.

    Args:
        output_dir: directory where checkpoints are written (default './results').

    Returns:
        transformers.TrainingArguments configured to evaluate and save every
        epoch and to reload the best (highest-accuracy) checkpoint at the end.
    """
    return TrainingArguments(
        output_dir=output_dir,
        # FIX: `evaluation_strategy` was renamed to `eval_strategy` in
        # transformers 4.41 and the old name was later removed; the previous
        # version of this function (bert/bert_model.py) already used
        # `eval_strategy`, so keep that spelling for consistency.
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        # Reload the checkpoint with the best eval accuracy after training.
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        logging_dir='./logs',
        logging_steps=10
    )

# Prediction function to use after training
def predict_sentiment(model, tokenizer, texts):
    """Classify each text in `texts` as 'anti', 'neutral', or 'pro'.

    Runs the model in eval mode on GPU when available, batching all texts
    through the tokenizer in one call.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        # Without labels the model returns bare logits.
        logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

    class_ids = torch.argmax(logits, dim=1).cpu().numpy()
    # Index-to-label mapping covering every possible class id (0, 1, 2).
    sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [sentiment_map[cid] for cid in class_ids]
16 changes: 16 additions & 0 deletions models/BilSTM/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import torch
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """PyTorch dataset pairing padded tweet token sequences with sentiment labels."""

    def __init__(self, inputs, labels):
        # Materialize as long tensors up front so __getitem__ is a cheap index.
        self.inputs = torch.tensor(inputs, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)
62 changes: 62 additions & 0 deletions models/BilSTM/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def predict(model, tokenizer, text_list, word_index, embedding_dim=100, max_len=100, device='cpu'):
    """Predict sentiment labels for a list of raw texts with the trained BiLSTM.

    Args:
        model: trained BiLSTM classifier (torch.nn.Module) returning class logits.
        tokenizer: the Keras tokenizer that was fit on the training data.
        text_list: list of raw tweet strings.
        word_index: unused; kept for backward compatibility with existing callers.
        embedding_dim: unused; kept for backward compatibility with existing callers.
        max_len: pad/truncate length; must match the length used at training time.
        device: torch device to run inference on.

    Returns:
        List of predicted labels drawn from {'anti', 'neutral', 'pro'}.
    """
    # Local import keeps keras optional for callers that never predict.
    # FIX: dropped the unused `Tokenizer` import that was dead code here.
    from keras.preprocessing.sequence import pad_sequences  # type: ignore

    model.eval()
    model.to(device)

    # Use the same tokenizer that was fit on training data
    sequences = tokenizer.texts_to_sequences(text_list)
    padded = pad_sequences(sequences, maxlen=max_len)
    inputs = torch.tensor(padded, dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

    label_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [label_map[p] for p in preds]

def evaluate_model(model, test_loader, device, label_names):
    """Run the model over the test loader, print a classification report and
    accuracy, and save a confusion-matrix plot.
    """
    model.eval()
    predictions, gold = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            logits = model(batch_inputs.to(device))
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels stay on CPU in the loader, so .numpy() is safe here.
            gold.extend(batch_labels.numpy())

    print("\nClassification Report:")
    print(classification_report(gold, predictions, target_names=label_names))
    print(f"Accuracy: {accuracy_score(gold, predictions):.4f}")

    plot_confusion_matrix(confusion_matrix(gold, predictions), label_names)


def plot_confusion_matrix(cm, class_names):
    """Render `cm` as an annotated heatmap and save it to bilstm_confusion_matrix.png."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # integer counts in each cell
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.savefig("bilstm_confusion_matrix.png")
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
Loading