25,860 changes: 25,817 additions & 43 deletions notebooks/edu_content_human_as_a_judge.ipynb

Large diffs are not rendered by default.

Empty file added src/ml_filter/__init__.py
Empty file.
14 changes: 13 additions & 1 deletion src/ml_filter/__main__.py
@@ -10,7 +10,10 @@

from ml_filter.analysis.aggregate_scores import aggregate_human_annotations, aggregate_scores
from ml_filter.analysis.collect_ir_metrics import collect_ir_metrics
from ml_filter.analysis.evaluate_predicted_annotations import evaluate_predicted_annotations
from ml_filter.analysis.evaluate_predicted_annotations import (
evaluate_predicted_annotations,
evaluate_prediction_correlation,
)
from ml_filter.analysis.plot_score_distributions import plot_differences_in_scores, plot_scores
from ml_filter.compare_experiments import compare_experiments
from ml_filter.data_processing.deduplication import deduplicate_jsonl
@@ -287,6 +290,15 @@ def evaluate_predicted_annotations_cli(
)


@main.command(name="evaluate_prediction_correlation")
@input_directory_option
def evaluate_prediction_correlation_cli(
input_directory: Path,
) -> None:
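"""Evaluate the pairwise score correlation between the configured annotator models."""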
model_filters = ["gemma-3-27b-it", "Llama-3.3-70B-Instruct", "Mistral-Small-3.1-24B-Instruct-2503"]
evaluate_prediction_correlation(input_directory=input_directory, model_filters=model_filters)


@main.command(name="aggregate_scores")
@input_directory_option
@output_directory_option
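For reviewers, a minimal sketch of how the new subcommand could be exercised directly from Python; the input path below is an assumption, not part of the PR.

from pathlib import Path

from ml_filter.analysis.evaluate_predicted_annotations import evaluate_prediction_correlation

# Mirrors what evaluate_prediction_correlation_cli does, with a hypothetical input path.
evaluate_prediction_correlation(
    input_directory=Path("data/annotations"),  # assumed directory containing annotations_*.jsonl files
    model_filters=["gemma-3-27b-it", "Llama-3.3-70B-Instruct", "Mistral-Small-3.1-24B-Instruct-2503"],
)

Via the CLI this would look something like "python -m ml_filter evaluate_prediction_correlation --input-directory data/annotations", assuming input_directory_option exposes an --input-directory flag.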
43 changes: 42 additions & 1 deletion src/ml_filter/analysis/evaluate_predicted_annotations.py
@@ -1,7 +1,9 @@
import itertools
import logging
from pathlib import Path

from ml_filter.analysis.interrater_reliability import compute_interrater_reliability_metrics
from ml_filter.analysis.interrater_reliability import compute_interrater_reliability_metrics, compute_metrics
from ml_filter.analysis.utils import get_common_docs, get_document_scores_df
from ml_filter.utils.logging import get_logger

logger = get_logger(name=__name__, level=logging.INFO) # Set up logging
@@ -75,3 +77,42 @@ def evaluate_predicted_annotations(
lang=lang,
)
logger.info(f"Metrics successfully written to {lang_dir}")


def evaluate_prediction_correlation(
input_directory: Path,
model_filters: list[str],
) -> None:
# Find all files matching the pattern in the directory and subdirectories
files = list(input_directory.rglob("annotations_*.jsonl"))

# Correlation requires at least two annotation files to compare
if len(files) < 2:
raise ValueError(f"Fewer than two annotation files found in {input_directory} or its subdirectories.")

filtered_file_paths = [
file_path for file_path in files if any(model_filter in str(file_path) for model_filter in model_filters)
]

scores_df = get_document_scores_df(
input_file_paths=filtered_file_paths,
aggregation_strategy="majority",
valid_labels=[0, 1, 2, 3, 4, 5],
)

# Create all unordered pairs of models to compare
model_pairs = list(itertools.combinations(model_filters, 2))
for model_pair in model_pairs:
model_1, model_2 = model_pair
common_docs_df = get_common_docs(scores_df, model_1, model_2)
valid_docs_df = common_docs_df[
(common_docs_df["score_0"] != "invalid") & (common_docs_df["score_1"] != "invalid")
]
valid_docs_df = valid_docs_df[valid_docs_df["prompt_lang"] != "en"]

metrics = compute_metrics(
num_total_docs=len(common_docs_df),
valid_docs_df=valid_docs_df,
thresholds=[1],
)
print(f"{model_1} vs. {model_2} Spearman: {float(metrics['metrics']['Spearman'])}")
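As a quick sanity check on the value reported per model pair, here is a minimal, self-contained sketch of a Spearman rank correlation on two score lists using scipy; the scores are made up and this bypasses compute_metrics, so it only illustrates the statistic, not the PR's exact computation.

from scipy.stats import spearmanr

# Illustrative 0-5 scores from two models on the same eight documents (not real data).
scores_model_1 = [0, 1, 2, 3, 4, 5, 2, 3]
scores_model_2 = [0, 2, 2, 3, 5, 4, 1, 3]

rho, p_value = spearmanr(scores_model_1, scores_model_2)
print(f"Spearman rho = {rho:.3f} (p = {p_value:.3f})")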
39 changes: 34 additions & 5 deletions src/ml_filter/analysis/interrater_reliability.py
@@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.metrics import cohen_kappa_score, f1_score, ndcg_score, precision_score, recall_score
from statsmodels.stats.inter_rater import fleiss_kappa

from ml_filter.analysis.plot_score_distributions import plot_confusion_matrix
@@ -189,7 +189,33 @@ def compute_gt_metrics(
# Otherwise, zipping will provide the wrong results
class_f1_scores = f1_score(ground_truth_rounded, predictions_rounded, average=None, labels=valid_labels)
for valid_label, f1 in zip(valid_labels, class_f1_scores):
gt_metrics[f"F1-{valid_label}"] = f1
gt_metrics[f"F1-{valid_label}_vs_rest"] = f1

# f1 score at threshold
for t in np.array(list(range(5))) + 0.5:
ground_truth_rounded_bin = (np.array(ground_truth_rounded) >= t).astype(int)
predictions_rounded_bin = (np.array(predictions_rounded) >= t).astype(int)
gt_metrics[f"F1-{t}"] = f1_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Recall-{t}"] = recall_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Precision-{t}"] = precision_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)

# NDCG@all
gt_metrics["NDCG@all"] = ndcg_score(y_true=[ground_truth_scores], y_score=[predicted_scores], k=None)

return gt_metrics
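A minimal sketch of the thresholded binarization added above, assuming the 0-5 label scale used elsewhere in this PR; it reproduces the idea (a score at or above the cutoff counts as positive) but not the exact argument handling of compute_gt_metrics.

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# Illustrative rounded scores on a 0-5 scale (not real data).
ground_truth = np.array([0, 1, 2, 3, 4, 5, 3, 2])
predictions = np.array([0, 2, 2, 3, 5, 4, 2, 2])

for t in np.arange(0.5, 5.0, 1.0):  # thresholds 0.5, 1.5, 2.5, 3.5, 4.5
    gt_bin = (ground_truth >= t).astype(int)  # 1 means the score reaches this cutoff
    pred_bin = (predictions >= t).astype(int)
    print(
        f"t={t}: "
        f"F1={f1_score(gt_bin, pred_bin, zero_division=0):.2f} "
        f"P={precision_score(gt_bin, pred_bin, zero_division=0):.2f} "
        f"R={recall_score(gt_bin, pred_bin, zero_division=0):.2f}"
    )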

@@ -215,7 +241,7 @@ def plot_invalid_docs_histogram(
plt.hist(correct_scores_of_invalid_docs, bins=[0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5], alpha=0.5, edgecolor="black")
plt.xlabel("Scores")
plt.ylabel("Frequency")
plt.title(f"Histogram of Invalid Scores for {annotator_name} and langauge {language}.")
plt.title(f"Histogram of invalid scores for {annotator_name} and language {language}.")
plt.grid(True)
plt.savefig(output_file_path)

@@ -368,12 +394,15 @@ def compare_annotator_to_gt(
gt_idx = 0
ground_truth_scores = valid_docs_df["score_0"].to_list()
predicted_scores = valid_docs_df["score_1"].to_list()
else:
elif annotators[1] == "gt":
annotator_idx = 0
gt_idx = 1
ground_truth_scores = valid_docs_df["score_1"].to_list()
predicted_scores = valid_docs_df["score_0"].to_list()

else:
raise ValueError(f"Expected one of the annotators to be 'gt', but found {annotators[0]} and {annotators[1]}")

annotator_name = annotators[annotator_idx]

gt_metrics = compute_gt_metrics(
@@ -399,7 +428,7 @@ def compare_annotator_to_gt(
plot_confusion_matrix(
cm_dict=cm,
annotator_name=annotator_name,
output_file_path=output_dir / f"confusion_matrix_{annotator_name}_gt.png",
output_file_path=output_dir / f"confusion_matrix_{annotator_name}_gt_{lang}.pdf",
valid_labels=[int(valid_label) for valid_label in valid_labels],
language=lang,
)
23 changes: 17 additions & 6 deletions src/ml_filter/analysis/plot_score_distributions.py
@@ -177,14 +177,25 @@ def plot_confusion_matrix(
cm_array = np.array(cm_array)

# Plot the confusion matrix
plt.figure(figsize=(10, 6))
plt.figure(figsize=(6.66, 4))

# Normalize the confusion matrix
cm_normalized = cm_array.astype("float") / cm_array.sum(axis=1)[:, np.newaxis]
xlabels = [label if label != -1 else "invalid" for label in all_labels]
sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", xticklabels=xlabels, yticklabels=valid_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title(f"Confusion Matrix for {annotator_name} and language {language}.")
plt.savefig(output_file_path)
sns.heatmap(
cm_normalized,
annot=True,
fmt=".2f",
cmap="Blues",
xticklabels=xlabels,
yticklabels=valid_labels,
annot_kws={"size": 14},
)
plt.xlabel("Predicted", fontsize=16)
plt.ylabel("True", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# plt.title(f"Confusion Matrix for {annotator_name} and language {language}.")
plt.tight_layout()
plt.savefig(output_file_path, format="pdf", bbox_inches="tight")
plt.show()
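For context on the cm_normalized line above, a tiny standalone example of row-normalizing a confusion matrix with made-up counts; each row is divided by its total so the heatmap cells show per-true-label fractions.

import numpy as np

# Made-up 3x3 confusion matrix: rows are true labels, columns are predicted labels.
cm = np.array([
    [8, 2, 0],
    [1, 6, 3],
    [0, 4, 6],
])

cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
print(cm_normalized.round(2))  # every row sums to 1.0

Saving with format="pdf" and bbox_inches="tight" keeps the figure vector-based and trims surrounding whitespace, which suits inclusion in a paper.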
26 changes: 25 additions & 1 deletion src/ml_filter/analysis/utils.py
@@ -6,6 +6,8 @@

import pandas as pd

from ml_filter.utils.logging import get_logger


def custom_round(x: int | float) -> int:
"""Rounds values > x.5 to x+1 and values < x.5 to x.
@@ -83,6 +85,10 @@ def get_document_scores_df(
with open(file_path, "r") as f:
for line in f:
json_obj = json.loads(line)
if "document_id" not in json_obj or json_obj["document_id"] is None:
raise ValueError(
f"Document ID is missing in the JSON object: {json_obj}. Please check the input file."
)

# replace invalid scores with None
scores = []
@@ -124,6 +130,19 @@ def get_document_scores_df(
)

document_scores_df = pd.DataFrame(document_scores)

Review comment (Collaborator): Why is the filtering necessary? We handle the case of missing / unmatched document IDs later by filtering on documents that are common to the annotators we are currently comparing.

# make sure that we have the same number of documents with the same doc_id for each annotator
doc_ids_per_annotator = document_scores_df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
if not doc_ids == first_doc_ids:
if len(doc_ids - first_doc_ids) > 0:
get_logger(name="main").warning(
f"{'__'.join(doc_ids_per_annotator.index[0])} misses: {doc_ids - first_doc_ids}"
)
if len(first_doc_ids - doc_ids) > 0:
get_logger(name="main").warning(f"{'__'.join(index)} misses: {first_doc_ids - doc_ids}")

return document_scores_df
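To illustrate the consistency check above on a toy frame (column values are illustrative, not the project's data): annotator B is missing doc d2, so only the B group triggers a warning.

import pandas as pd

df = pd.DataFrame({
    "annotator": ["A", "A", "B"],
    "prompt": ["p1", "p1", "p1"],
    "prompt_lang": ["de", "de", "de"],
    "doc_id": ["d1", "d2", "d1"],
})

doc_ids_per_annotator = df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))  # doc ids of the first group: {"d1", "d2"}
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
    print(index, "misses:", first_doc_ids - doc_ids)
# ('A', 'p1', 'de') misses: set()
# ('B', 'p1', 'de') misses: {'d2'}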


@@ -171,7 +190,12 @@ def get_common_docs(document_scores_df: pd.DataFrame, annotator_0: str, annotato
f"while annotator {annotator_1} has {len(annotator_1_df)} documents."
)
# only consider documents that are annotated by both annotators and have valid scores
common_docs_df = pd.merge(annotator_0_df, annotator_1_df, on=["doc_id", "prompt"], suffixes=("_0", "_1"))
common_docs_df = pd.merge(
annotator_0_df, annotator_1_df, on=["doc_id", "prompt_lang", "prompt"], suffixes=("_0", "_1")
)

if len(common_docs_df) * 2 != len(document_scores_df):
get_logger(name="main").warning("Not all documents can be matched on columns doc_id, prompt_lang, and prompt.")

# add rounded scores for each annotator
for idx in (0, 1):
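To illustrate the three-key merge and the _0/_1 suffixes on a toy example (column names beyond the merge keys are assumptions):

import pandas as pd

annotator_0_df = pd.DataFrame(
    {"doc_id": ["d1", "d2"], "prompt_lang": ["de", "de"], "prompt": ["p1", "p1"], "score": [3, 1]}
)
annotator_1_df = pd.DataFrame(
    {"doc_id": ["d1", "d3"], "prompt_lang": ["de", "de"], "prompt": ["p1", "p1"], "score": [4, 2]}
)

# The default inner join keeps only d1, the document both annotators scored;
# the overlapping "score" column becomes score_0 and score_1 via the suffixes.
common_docs_df = pd.merge(
    annotator_0_df, annotator_1_df, on=["doc_id", "prompt_lang", "prompt"], suffixes=("_0", "_1")
)
print(common_docs_df)  # one row: d1 / de / p1 with score_0=3, score_1=4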