Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 0 additions & 52 deletions bert/bert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,59 +50,7 @@ def __getitem__(self, idx):
def __len__(self):
    # Dataset size == number of labels (one label per encoded example).
    return len(self.labels)

# Function to compute metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer eval step.

    `eval_pred` is the (predictions, labels) pair the HF Trainer passes in;
    predictions are raw logits, so argmax picks the predicted class.
    """
    # Imported lazily so sklearn is only required when evaluation runs.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, gold = eval_pred
    predicted = logits.argmax(axis=1)

    # 'weighted' averages per-class scores by class support.
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')

    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training arguments for the Trainer
def get_training_args(output_dir='./results'):
    """Build the HF TrainingArguments used to fine-tune the sentiment model.

    Args:
        output_dir: directory where checkpoints are written (default './results').

    Returns:
        transformers.TrainingArguments configured to evaluate and save every
        epoch and to reload the best (highest-accuracy) checkpoint at the end.
    """
    # Imported lazily so transformers is only required when training runs.
    from transformers import TrainingArguments

    return TrainingArguments(
        output_dir=output_dir,
        # Evaluate/save once per epoch so load_best_model_at_end can compare
        # checkpoints on the same schedule.
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        logging_dir='./logs',
        logging_steps=10
    )

# Prediction function to use after training
def predict_sentiment(model, tokenizer, texts):
    """Classify each text in `texts` as 'anti', 'neutral', or 'pro'.

    Runs the model in eval mode on GPU when available, batching all texts
    through the tokenizer in one call.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        # Without labels the model returns bare logits.
        logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

    class_ids = torch.argmax(logits, dim=1).cpu().numpy()
    # Index-to-label mapping covering every possible class id (0, 1, 2).
    sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [sentiment_map[cid] for cid in class_ids]

2 changes: 1 addition & 1 deletion bert/bert_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import torch
from transformers import AutoTokenizer, Trainer
from sklearn.metrics import classification_report
from model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment
from NLP_climate_analysis.models.BERT.model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment

def main():
# Check for CUDA
Expand Down
8,311 changes: 8,311 additions & 0 deletions data/test_data.csv

Large diffs are not rendered by default.

55,466 changes: 27,733 additions & 27,733 deletions train_data.csv → data/train_data.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
56 changes: 56 additions & 0 deletions models/BERT/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig
from torch.nn import CrossEntropyLoss

class ClimateModel(nn.Module):
    """DistilBERT encoder topped with a 3-way linear head (anti/neutral/pro)."""

    def __init__(self):
        super().__init__()
        cfg = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=3)
        self.model = AutoModel.from_pretrained("distilbert-base-uncased", config=cfg)
        # Three output classes: anti, neutral, pro.
        self.classifier = nn.Linear(self.model.config.hidden_size, 3)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run a forward pass.

        Returns (loss, logits) when `labels` is given — the tuple format the
        HF Trainer expects — and bare logits otherwise.
        """
        encoder_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first ([CLS]) token, regularized with dropout,
        # feeds the classification head.
        cls_state = self.dropout(encoder_out.last_hidden_state[:, 0])
        logits = self.classifier(cls_state)

        if labels is None:
            # Evaluation / prediction path: logits only.
            return logits

        # CrossEntropyLoss wants logits [batch, num_labels] vs labels [batch].
        loss = CrossEntropyLoss()(logits.view(-1, 3), labels.view(-1))
        return (loss, logits)

# Dataset class for the climate sentiment data
class ClimateDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with sentiment labels for the HF Trainer."""

    def __init__(self, encodings, labels):
        # `encodings` is a dict of equal-length tensors (input_ids, attention_mask, ...).
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        sample = {name: tensor[idx] for name, tensor in self.encodings.items()}
        # The Trainer looks for the target under the 'labels' key.
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        return len(self.labels)





110 changes: 110 additions & 0 deletions models/BERT/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd
import torch
from transformers import AutoTokenizer, Trainer
from sklearn.metrics import classification_report
from NLP_climate_analysis.models.BERT.model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment

def main():
    """Fine-tune the DistilBERT climate-sentiment model, evaluate it, save it,
    and print example predictions.

    NOTE(review): the module-level import pulls compute_metrics /
    get_training_args / predict_sentiment from ...BERT.model, but this PR
    defines them in models/BERT/utils.py — confirm the import path.
    """
    # Check for CUDA
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    print("Loading data...")
    # FIX: the CSVs were moved into data/ in this change set; the old
    # repo-root paths would raise FileNotFoundError.
    train_df = pd.read_csv('data/train_data.csv')
    test_df = pd.read_csv('data/test_data.csv')

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Ensure all messages are strings and handle potential NaN values
    train_df['message'] = train_df['message'].fillna('').astype(str)
    test_df['message'] = test_df['message'].fillna('').astype(str)

    # Tokenize texts
    print("Tokenizing texts...")
    train_encodings = tokenizer(
        train_df['message'].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        test_df['message'].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )

    # Convert encodings to dataset format
    train_dataset = ClimateDataset(
        {k: v.squeeze() for k, v in train_encodings.items()},
        torch.tensor(train_df['sentiment'].tolist())
    )

    test_dataset = ClimateDataset(
        {k: v.squeeze() for k, v in test_encodings.items()},
        torch.tensor(test_df['sentiment'].tolist())
    )

    # Initialize model
    model = ClimateModel()

    # Get training arguments
    training_args = get_training_args(output_dir='./results')

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    print("Training model...")
    trainer.train()

    # Evaluate model
    print("Evaluating model...")
    evaluation = trainer.evaluate()
    print(f"Evaluation results: {evaluation}")

    # Generate predictions for test data
    print("Generating predictions...")
    test_outputs = trainer.predict(test_dataset)
    predicted_labels = test_outputs.predictions.argmax(-1)
    true_labels = test_df['sentiment'].values

    # Generate classification report
    target_names = ['anti', 'neutral', 'pro']
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, target_names=target_names))

    # Save model
    print("Saving model...")
    trainer.save_model('./saved_model')
    tokenizer.save_pretrained('./saved_model')

    print("Model training and evaluation complete!")

    # Example prediction
    example_texts = [
        "Climate change is a real threat that requires immediate action.",
        "I'm not sure if humans are causing climate change or if it's natural.",
        "Global warming is a hoax created by scientists for grant money."
    ]

    predictions = predict_sentiment(model, tokenizer, example_texts)

    print("\nExample predictions:")
    for text, pred in zip(example_texts, predictions):
        print(f"Text: {text}")
        print(f"Prediction: {pred}\n")

if __name__ == "__main__":
    main()
53 changes: 53 additions & 0 deletions models/BERT/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments

# Function to compute metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer eval step.

    `eval_pred` is the (logits, labels) pair the HF Trainer supplies; the
    predicted class is the argmax over the logits.
    """
    logits, gold = eval_pred
    predicted = logits.argmax(axis=1)
    # 'weighted' averages per-class scores by class support.
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training arguments for the Trainer
def get_training_args(output_dir='./results'):
    """Build the HF TrainingArguments used to fine-tune the sentiment model.

    Args:
        output_dir: directory where checkpoints are written (default './results').

    Returns:
        transformers.TrainingArguments configured to evaluate and save every
        epoch and to reload the best (highest-accuracy) checkpoint at the end.
    """
    return TrainingArguments(
        output_dir=output_dir,
        # FIX: `evaluation_strategy` was renamed to `eval_strategy` in
        # transformers 4.41 and the old name was later removed; the previous
        # version of this function (bert/bert_model.py) already used
        # `eval_strategy`, so keep that spelling for consistency.
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        # Reload the checkpoint with the best eval accuracy after training.
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        logging_dir='./logs',
        logging_steps=10
    )

# Prediction function to use after training
def predict_sentiment(model, tokenizer, texts):
    """Classify each text in `texts` as 'anti', 'neutral', or 'pro'.

    Runs the model in eval mode on GPU when available, batching all texts
    through the tokenizer in one call.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        # Without labels the model returns bare logits.
        logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

    class_ids = torch.argmax(logits, dim=1).cpu().numpy()
    # Index-to-label mapping covering every possible class id (0, 1, 2).
    sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [sentiment_map[cid] for cid in class_ids]
16 changes: 16 additions & 0 deletions models/BilSTM/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import torch
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """PyTorch dataset pairing padded tweet token sequences with sentiment labels."""

    def __init__(self, inputs, labels):
        # Materialize as long tensors up front so __getitem__ is a cheap index.
        self.inputs = torch.tensor(inputs, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)
62 changes: 62 additions & 0 deletions models/BilSTM/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def predict(model, tokenizer, text_list, word_index, embedding_dim=100, max_len=100, device='cpu'):
    """Predict sentiment labels for a list of raw texts with the trained BiLSTM.

    Args:
        model: trained BiLSTM classifier (torch.nn.Module) returning class logits.
        tokenizer: the Keras tokenizer that was fit on the training data.
        text_list: list of raw tweet strings.
        word_index: unused; kept for backward compatibility with existing callers.
        embedding_dim: unused; kept for backward compatibility with existing callers.
        max_len: pad/truncate length; must match the length used at training time.
        device: torch device to run inference on.

    Returns:
        List of predicted labels drawn from {'anti', 'neutral', 'pro'}.
    """
    # Local import keeps keras optional for callers that never predict.
    # FIX: dropped the unused `Tokenizer` import that was dead code here.
    from keras.preprocessing.sequence import pad_sequences  # type: ignore

    model.eval()
    model.to(device)

    # Use the same tokenizer that was fit on training data
    sequences = tokenizer.texts_to_sequences(text_list)
    padded = pad_sequences(sequences, maxlen=max_len)
    inputs = torch.tensor(padded, dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

    label_map = {0: 'anti', 1: 'neutral', 2: 'pro'}
    return [label_map[p] for p in preds]

def evaluate_model(model, test_loader, device, label_names):
    """Run the model over the test loader, print a classification report and
    accuracy, and save a confusion-matrix plot.
    """
    model.eval()
    predictions, gold = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            logits = model(batch_inputs.to(device))
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels stay on CPU in the loader, so .numpy() is safe here.
            gold.extend(batch_labels.numpy())

    print("\nClassification Report:")
    print(classification_report(gold, predictions, target_names=label_names))
    print(f"Accuracy: {accuracy_score(gold, predictions):.4f}")

    plot_confusion_matrix(confusion_matrix(gold, predictions), label_names)


def plot_confusion_matrix(cm, class_names):
    """Render `cm` as an annotated heatmap and save it to bilstm_confusion_matrix.png."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # integer counts in each cell
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.savefig("bilstm_confusion_matrix.png")
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
Loading