25,860 changes: 25,817 additions & 43 deletions notebooks/edu_content_human_as_a_judge.ipynb

Large diffs are not rendered by default.

Empty file added src/ml_filter/__init__.py
Empty file.
14 changes: 13 additions & 1 deletion src/ml_filter/__main__.py
@@ -10,7 +10,10 @@

from ml_filter.analysis.aggregate_scores import aggregate_human_annotations, aggregate_scores
from ml_filter.analysis.collect_ir_metrics import collect_ir_metrics
from ml_filter.analysis.evaluate_predicted_annotations import evaluate_predicted_annotations
from ml_filter.analysis.evaluate_predicted_annotations import (
evaluate_predicted_annotations,
evaluate_prediction_correlation,
)
from ml_filter.analysis.plot_score_distributions import plot_differences_in_scores, plot_scores
from ml_filter.compare_experiments import compare_experiments
from ml_filter.data_processing.deduplication import deduplicate_jsonl
@@ -287,6 +290,15 @@ def evaluate_predicted_annotations_cli(
)


@main.command(name="evaluate_prediction_correlation")
@input_directory_option
def evaluate_prediction_correlation_cli(
input_directory: Path,
) -> None:
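"""Evaluate the pairwise score correlation between the configured annotator models."""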
model_filters = ["gemma-3-27b-it", "Llama-3.3-70B-Instruct", "Mistral-Small-3.1-24B-Instruct-2503"]
evaluate_prediction_correlation(input_directory=input_directory, model_filters=model_filters)


@main.command(name="aggregate_scores")
@input_directory_option
@output_directory_option
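For reviewers, a minimal sketch of how the new subcommand could be exercised directly from Python; the input path below is an assumption, not part of the PR.

from pathlib import Path

from ml_filter.analysis.evaluate_predicted_annotations import evaluate_prediction_correlation

# Mirrors what evaluate_prediction_correlation_cli does, with a hypothetical input path.
evaluate_prediction_correlation(
    input_directory=Path("data/annotations"),  # assumed directory containing annotations_*.jsonl files
    model_filters=["gemma-3-27b-it", "Llama-3.3-70B-Instruct", "Mistral-Small-3.1-24B-Instruct-2503"],
)

Via the CLI this would look something like "python -m ml_filter evaluate_prediction_correlation --input-directory data/annotations", assuming input_directory_option exposes an --input-directory flag.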
43 changes: 42 additions & 1 deletion src/ml_filter/analysis/evaluate_predicted_annotations.py
@@ -1,7 +1,9 @@
import itertools
import logging
from pathlib import Path

from ml_filter.analysis.interrater_reliability import compute_interrater_reliability_metrics
from ml_filter.analysis.interrater_reliability import compute_interrater_reliability_metrics, compute_metrics
from ml_filter.analysis.utils import get_common_docs, get_document_scores_df
from ml_filter.utils.logging import get_logger

logger = get_logger(name=__name__, level=logging.INFO) # Set up logging
@@ -75,3 +77,42 @@ def evaluate_predicted_annotations(
lang=lang,
)
logger.info(f"Metrics successfully written to {lang_dir}")


def evaluate_prediction_correlation(
input_directory: Path,
model_filters: list[str],
) -> None:
# Find all files matching the pattern in the directory and subdirectories
files = list(input_directory.rglob("annotations_*.jsonl"))

# Correlation requires at least two annotation files to compare
if len(files) < 2:
raise ValueError(f"Fewer than two annotation files found in {input_directory} or its subdirectories.")

filtered_file_paths = [
file_path for file_path in files if any(model_filter in str(file_path) for model_filter in model_filters)
]

scores_df = get_document_scores_df(
input_file_paths=filtered_file_paths,
aggregation_strategy="majority",
valid_labels=[0, 1, 2, 3, 4, 5],
)

# Create all unordered pairs of models to compare
model_pairs = list(itertools.combinations(model_filters, 2))
for model_pair in model_pairs:
model_1, model_2 = model_pair
common_docs_df = get_common_docs(scores_df, model_1, model_2)
valid_docs_df = common_docs_df[
(common_docs_df["score_0"] != "invalid") & (common_docs_df["score_1"] != "invalid")
]
valid_docs_df = valid_docs_df[valid_docs_df["prompt_lang"] != "en"]

metrics = compute_metrics(
num_total_docs=len(common_docs_df),
valid_docs_df=valid_docs_df,
thresholds=[1],
)
print(f"{model_1} vs. {model_2} Spearman: {float(metrics['metrics']['Spearman'])}")
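As a quick sanity check on the value reported per model pair, here is a minimal, self-contained sketch of a Spearman rank correlation on two score lists using scipy; the scores are made up and this bypasses compute_metrics, so it only illustrates the statistic, not the PR's exact computation.

from scipy.stats import spearmanr

# Illustrative 0-5 scores from two models on the same eight documents (not real data).
scores_model_1 = [0, 1, 2, 3, 4, 5, 2, 3]
scores_model_2 = [0, 2, 2, 3, 5, 4, 1, 3]

rho, p_value = spearmanr(scores_model_1, scores_model_2)
print(f"Spearman rho = {rho:.3f} (p = {p_value:.3f})")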
39 changes: 34 additions & 5 deletions src/ml_filter/analysis/interrater_reliability.py
@@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.metrics import cohen_kappa_score, f1_score, ndcg_score, precision_score, recall_score
from statsmodels.stats.inter_rater import fleiss_kappa

from ml_filter.analysis.plot_score_distributions import plot_confusion_matrix
@@ -189,7 +189,33 @@ def compute_gt_metrics(
# Otherwise, zipping will provide the wrong results
class_f1_scores = f1_score(ground_truth_rounded, predictions_rounded, average=None, labels=valid_labels)
for valid_label, f1 in zip(valid_labels, class_f1_scores):
gt_metrics[f"F1-{valid_label}"] = f1
gt_metrics[f"F1-{valid_label}_vs_rest"] = f1

# f1 score at threshold
for t in np.array(list(range(5))) + 0.5:
ground_truth_rounded_bin = (np.array(ground_truth_rounded) >= t).astype(int)
predictions_rounded_bin = (np.array(predictions_rounded) >= t).astype(int)
gt_metrics[f"F1-{t}"] = f1_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Recall-{t}"] = recall_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Precision-{t}"] = precision_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)

# NDCG@all
gt_metrics["NDCG@all"] = ndcg_score(y_true=[ground_truth_scores], y_score=[predicted_scores], k=None)

return gt_metrics
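A minimal sketch of the thresholded binarization added above, assuming the 0-5 label scale used elsewhere in this PR; it reproduces the idea (a score at or above the cutoff counts as positive) but not the exact argument handling of compute_gt_metrics.

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# Illustrative rounded scores on a 0-5 scale (not real data).
ground_truth = np.array([0, 1, 2, 3, 4, 5, 3, 2])
predictions = np.array([0, 2, 2, 3, 5, 4, 2, 2])

for t in np.arange(0.5, 5.0, 1.0):  # thresholds 0.5, 1.5, 2.5, 3.5, 4.5
    gt_bin = (ground_truth >= t).astype(int)  # 1 means the score reaches this cutoff
    pred_bin = (predictions >= t).astype(int)
    print(
        f"t={t}: "
        f"F1={f1_score(gt_bin, pred_bin, zero_division=0):.2f} "
        f"P={precision_score(gt_bin, pred_bin, zero_division=0):.2f} "
        f"R={recall_score(gt_bin, pred_bin, zero_division=0):.2f}"
    )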

@@ -215,7 +241,7 @@ def plot_invalid_docs_histogram(
plt.hist(correct_scores_of_invalid_docs, bins=[0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5], alpha=0.5, edgecolor="black")
plt.xlabel("Scores")
plt.ylabel("Frequency")
plt.title(f"Histogram of Invalid Scores for {annotator_name} and langauge {language}.")
plt.title(f"Histogram of invalid scores for {annotator_name} and language {language}.")
plt.grid(True)
plt.savefig(output_file_path)

@@ -368,12 +394,15 @@ def compare_annotator_to_gt(
gt_idx = 0
ground_truth_scores = valid_docs_df["score_0"].to_list()
predicted_scores = valid_docs_df["score_1"].to_list()
else:
elif annotators[1] == "gt":
annotator_idx = 0
gt_idx = 1
ground_truth_scores = valid_docs_df["score_1"].to_list()
predicted_scores = valid_docs_df["score_0"].to_list()

else:
raise ValueError(f"Expected one of the annotators to be 'gt', but found {annotators[0]} and {annotators[1]}")

annotator_name = annotators[annotator_idx]

gt_metrics = compute_gt_metrics(
@@ -399,7 +428,7 @@ def compare_annotator_to_gt(
plot_confusion_matrix(
cm_dict=cm,
annotator_name=annotator_name,
output_file_path=output_dir / f"confusion_matrix_{annotator_name}_gt.png",
output_file_path=output_dir / f"confusion_matrix_{annotator_name}_gt_{lang}.pdf",
valid_labels=[int(valid_label) for valid_label in valid_labels],
language=lang,
)
23 changes: 17 additions & 6 deletions src/ml_filter/analysis/plot_score_distributions.py
@@ -177,14 +177,25 @@ def plot_confusion_matrix(
cm_array = np.array(cm_array)

# Plot the confusion matrix
plt.figure(figsize=(10, 6))
plt.figure(figsize=(6.66, 4))

# Normalize the confusion matrix
cm_normalized = cm_array.astype("float") / cm_array.sum(axis=1)[:, np.newaxis]
xlabels = [label if label != -1 else "invalid" for label in all_labels]
sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", xticklabels=xlabels, yticklabels=valid_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title(f"Confusion Matrix for {annotator_name} and language {language}.")
plt.savefig(output_file_path)
sns.heatmap(
cm_normalized,
annot=True,
fmt=".2f",
cmap="Blues",
xticklabels=xlabels,
yticklabels=valid_labels,
annot_kws={"size": 14},
)
plt.xlabel("Predicted", fontsize=16)
plt.ylabel("True", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# plt.title(f"Confusion Matrix for {annotator_name} and language {language}.")
plt.tight_layout()
plt.savefig(output_file_path, format="pdf", bbox_inches="tight")
plt.show()
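For context on the cm_normalized line above, a tiny standalone example of row-normalizing a confusion matrix with made-up counts; each row is divided by its total so the heatmap cells show per-true-label fractions.

import numpy as np

# Made-up 3x3 confusion matrix: rows are true labels, columns are predicted labels.
cm = np.array([
    [8, 2, 0],
    [1, 6, 3],
    [0, 4, 6],
])

cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
print(cm_normalized.round(2))  # every row sums to 1.0

Saving with format="pdf" and bbox_inches="tight" keeps the figure vector-based and trims surrounding whitespace, which suits inclusion in a paper.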
26 changes: 25 additions & 1 deletion src/ml_filter/analysis/utils.py
@@ -6,6 +6,8 @@

import pandas as pd

from ml_filter.utils.logging import get_logger


def custom_round(x: int | float) -> int:
"""Rounds values > x.5 to x+1 and values < x.5 to x.
@@ -83,6 +85,10 @@ def get_document_scores_df(
with open(file_path, "r") as f:
for line in f:
json_obj = json.loads(line)
if "document_id" not in json_obj or json_obj["document_id"] is None:
raise ValueError(
f"Document ID is missing in the JSON object: {json_obj}. Please check the input file."
)

# replace invalid scores with None
scores = []
@@ -124,6 +130,19 @@ def get_document_scores_df(
)

document_scores_df = pd.DataFrame(document_scores)

Review comment (Collaborator): Why is the filtering necessary? We handle the case of missing / unmatched document IDs later by filtering on documents that are common to the annotators we are currently comparing.

# make sure that we have the same number of documents with the same doc_id for each annotator
doc_ids_per_annotator = document_scores_df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
if not doc_ids == first_doc_ids:
if len(doc_ids - first_doc_ids) > 0:
get_logger(name="main").warning(
f"{'__'.join(doc_ids_per_annotator.index[0])} misses: {doc_ids - first_doc_ids}"
)
if len(first_doc_ids - doc_ids) > 0:
get_logger(name="main").warning(f"{'__'.join(index)} misses: {first_doc_ids - doc_ids}")

return document_scores_df
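To illustrate the consistency check above on a toy frame (column values are illustrative, not the project's data): annotator B is missing doc d2, so only the B group triggers a warning.

import pandas as pd

df = pd.DataFrame({
    "annotator": ["A", "A", "B"],
    "prompt": ["p1", "p1", "p1"],
    "prompt_lang": ["de", "de", "de"],
    "doc_id": ["d1", "d2", "d1"],
})

doc_ids_per_annotator = df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))  # doc ids of the first group: {"d1", "d2"}
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
    print(index, "misses:", first_doc_ids - doc_ids)
# ('A', 'p1', 'de') misses: set()
# ('B', 'p1', 'de') misses: {'d2'}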


@@ -171,7 +190,12 @@ def get_common_docs(document_scores_df: pd.DataFrame, annotator_0: str, annotato
f"while annotator {annotator_1} has {len(annotator_1_df)} documents."
)
# only consider documents that are annotated by both annotators and have valid scores
common_docs_df = pd.merge(annotator_0_df, annotator_1_df, on=["doc_id", "prompt"], suffixes=("_0", "_1"))
common_docs_df = pd.merge(
annotator_0_df, annotator_1_df, on=["doc_id", "prompt_lang", "prompt"], suffixes=("_0", "_1")
)

if len(common_docs_df) * 2 != len(document_scores_df):
get_logger(name="main").warning("Not all documents can be matched on columns doc_id, prompt_lang, and prompt.")

# add rounded scores for each annotator
for idx in (0, 1):
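To illustrate the three-key merge and the _0/_1 suffixes on a toy example (column names beyond the merge keys are assumptions):

import pandas as pd

annotator_0_df = pd.DataFrame(
    {"doc_id": ["d1", "d2"], "prompt_lang": ["de", "de"], "prompt": ["p1", "p1"], "score": [3, 1]}
)
annotator_1_df = pd.DataFrame(
    {"doc_id": ["d1", "d3"], "prompt_lang": ["de", "de"], "prompt": ["p1", "p1"], "score": [4, 2]}
)

# The default inner join keeps only d1, the document both annotators scored;
# the overlapping "score" column becomes score_0 and score_1 via the suffixes.
common_docs_df = pd.merge(
    annotator_0_df, annotator_1_df, on=["doc_id", "prompt_lang", "prompt"], suffixes=("_0", "_1")
)
print(common_docs_df)  # one row: d1 / de / p1 with score_0=3, score_1=4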