-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluation.py
More file actions
138 lines (103 loc) · 4.17 KB
/
evaluation.py
File metadata and controls
138 lines (103 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import RobertaTokenizer
import PCLDataset
from torch.utils.data import DataLoader
import tqdm
# Function to plot loss curves from a trained model history
def plot_loss_curves(history, model_name):
    """Plot training and validation loss against optimizer steps.

    `history` is expected to alternate train-log / eval-log entries
    (as produced by a HuggingFace Trainer); the final entry carries no
    eval loss and is dropped.
    """
    epochs, steps, train_losses, val_losses = [], [], [], []
    # Walk the alternating log entries, skipping the trailing one.
    for idx, entry in enumerate(history[:-1]):
        if idx % 2 == 0:
            # Even positions hold the training-side metrics.
            epochs.append(entry['epoch'])
            steps.append(entry['step'])
            train_losses.append(entry['loss'])
        else:
            # Odd positions hold the matching evaluation loss.
            val_losses.append(entry['eval_loss'])
    # Draw both curves on a single axis.
    plt.figure(figsize=(10, 6))
    plt.plot(steps, train_losses, label='Training loss')
    plt.plot(steps, val_losses, label='Validation loss')
    plt.title(f'Loss Curves for {model_name}')
    plt.ylabel('Loss (cross-entropy)')
    plt.xlabel('Steps')
    plt.legend()
def predict_pcl(input, tokenizer, model):
    """Classify text as PCL / not-PCL with a sequence-classification model.

    Args:
        input: Raw text (str or list of str) to classify.
        tokenizer: HuggingFace tokenizer used to encode the text.
        model: Classifier whose output object exposes `.logits`
               (e.g. a transformers *ForSequenceClassification model).

    Returns:
        dict with 'prediction' (argmax class-index tensor) and
        'confidence' (softmax probability of the predicted class).
    """
    model.eval()
    encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():  # pure inference — no gradient tracking needed
        output = model(**encodings)
    # BUG FIX: transformers models return an output object, not a raw tensor,
    # so torch.max(output, 1) raised; read the logits explicitly. Softmax
    # makes 'confidence' an interpretable probability instead of a raw logit.
    probs = torch.softmax(output.logits, dim=1)
    confidence, prediction = torch.max(probs, dim=1)
    return {'prediction': prediction, 'confidence': confidence}
# Function to calculate F1 score during training
def compute_metrics(pred):
    """Return the positive-class (PCL) F1 from a Trainer EvalPrediction."""
    predicted = pred.predictions.argmax(-1)  # logits -> hard class labels
    gold = pred.label_ids
    return {"PCL (Class 1) F1": f1_score(gold, predicted)}
# Function to generate evaluation metrics of a model
def evaluate(model, eval_set, tokenizer=None, evaluate_wrong_predictions=True, evaluation_set="VALIDATION", binary_labels=False, electra_2=False):
    """Print a classification report and confusion matrix for `model` on `eval_set`.

    Args:
        model: Trained classifier used for prediction.
        eval_set: DataFrame-like with 'text' and 'labels' columns.
        tokenizer: Optional tokenizer; defaults to roberta-large's.
        evaluate_wrong_predictions: When True, also print up to 5 misclassified texts.
        evaluation_set: Name used in the printed section headers.
        binary_labels: True when `eval_set` labels are already binary.
        electra_2: For binary labels, use the ELECTRA-specific predictor.

    Returns:
        None — all results are printed.
    """
    if tokenizer is None:  # BUG FIX: identity test for None, was `tokenizer==None`
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    # Work on a copy so the caller's DataFrame is left untouched.
    val_set_copy = eval_set.copy()
    if not binary_labels:
        # Map labels back to the original binary scheme before scoring.
        val_set_copy['labels'] = val_set_copy['labels'].apply(get_orig_label)
    # NOTE(review): with `import PCLDataset` this expression calls the module
    # object itself — confirm the dataset class is exposed as a callable
    # under that name (e.g. via `from PCLDataset import PCLDataset`).
    val_dataloader = DataLoader(PCLDataset(tokenizer, val_set_copy.to_dict(orient='list')))
    preds = []
    tot_labels = []
    wrong_predictions = []
    with torch.no_grad():
        # BUG FIX: `tqdm` is imported as a module, so the progress-bar class
        # must be referenced as tqdm.tqdm — calling the module raises TypeError.
        for data in tqdm.tqdm(val_dataloader):
            labels = {'labels': data['labels']}
            text = data['text']
            # Pick the predictor matching the label scheme / model family.
            if not binary_labels:
                pred = predict_pcl(text, tokenizer, model)
            elif electra_2:
                pred = predict_pcl_electra(text, tokenizer, model)
            else:
                pred = predict_pcl_2(text, tokenizer, model)
            if pred['prediction'] != labels['labels']:
                wrong_predictions.append(text)
            preds.append(pred['prediction'].tolist())
            tot_labels.append(labels['labels'].tolist())
    # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score
    report = classification_report(tot_labels, preds, target_names=["Not PCL", "PCL"])
    cm = confusion_matrix(tot_labels, preds)
    print('\n')
    print(f"EVALUATION METRICS FROM {evaluation_set} DATA")
    print("----------------------------------------------------------")
    print(report)
    print('\n')
    print(f"CONFUSION MATRIX FROM {evaluation_set} DATA")
    print("----------------------------------------------------------")
    print(cm)
    print('\n')
    if evaluate_wrong_predictions:
        # Recover the original DataFrame rows for each misclassified batch.
        wrong_prediction_indices = []
        for wrong_pred in wrong_predictions:
            # `wrong_pred` is a DataLoader batch (list) of texts; match its first item.
            index = eval_set.index[eval_set['text'] == wrong_pred[0]]
            wrong_prediction_indices.append(index[0])
        # BUG FIX: original referenced undefined `val_set` here (NameError).
        wrong_preds_df = eval_set[eval_set.index.isin(wrong_prediction_indices)]
        print(f"SAMPLE OF 5 WRONG PREDICTIONS FROM {evaluation_set} DATA:")
        print("----------------------------------------------------------")
        for p in wrong_preds_df.text.values[:5]:
            print(p)
    return
# Function to generate test-set predictions from a model
def test_predict(model, eval_set, tokenizer):
    """Return the model's binary predictions for every row of `eval_set`.

    Args:
        model: Trained classifier passed through to `predict_pcl_2`.
        eval_set: DataFrame-like with a 'text' column.
        tokenizer: Tokenizer used to build the PCLDataset.

    Returns:
        List of per-batch prediction lists, in dataloader order.
    """
    val_set_copy = eval_set.copy()  # avoid mutating the caller's DataFrame
    val_dataloader = DataLoader(PCLDataset(tokenizer, val_set_copy.to_dict(orient='list')))
    preds = []
    with torch.no_grad():
        # BUG FIX: `tqdm` here is the imported module, which is not callable;
        # the progress-bar wrapper is tqdm.tqdm.
        for data in tqdm.tqdm(val_dataloader):
            pred = predict_pcl_2(data['text'], tokenizer, model)
            preds.append(pred['prediction'].tolist())
    return preds