From 2555694bc14def9c06b0484266abf76571e67af2 Mon Sep 17 00:00:00 2001
From: cam1llynha
Date: Fri, 24 Oct 2025 13:58:36 -0300
Subject: [PATCH] Add tip for logging evaluation metrics during regular evaluations

This PR adds a comment to the DPO script explaining how to log and save
evaluation metrics during regular evaluations (not only the final one) using a
custom callback. It also clarifies how W&B aggregates evaluation metrics.
Related to issue #2602.

Checklist:
- [x] Added a clear example for the custom callback
- [x] Clarified W&B aggregation behavior
- [x] No code logic changed, documentation tip only
---
 trl/scripts/dpo.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/trl/scripts/dpo.py b/trl/scripts/dpo.py
index b1c4c7608bf..5ef3e39e70d 100644
--- a/trl/scripts/dpo.py
+++ b/trl/scripts/dpo.py
@@ -157,6 +157,21 @@ def main(script_args, training_args, model_args, dataset_args):
         metrics = trainer.evaluate()
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
+        # 💡 Tip: To log and save evaluation metrics during regular evaluations (not only the final one),
+        # you can use a custom callback:
+        #
+        # from transformers import TrainerCallback
+        #
+        # class LogEvalMetricsCallback(TrainerCallback):
+        #     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
+        #         if metrics:
+        #             trainer.log_metrics("eval", metrics)
+        #             trainer.save_metrics("eval", metrics)
+        #
+        # trainer.add_callback(LogEvalMetricsCallback())
+        #
+        # Note: Metrics logged to Weights & Biases (W&B) are aggregated over the entire evaluation
+        # dataset, not per batch. For per-batch logging, use the `on_prediction_step` callback hook.
 
     # Save and push to Hub
     trainer.save_model(training_args.output_dir)
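
Below is a minimal, self-contained sketch of the pattern the tip describes. It is a variant of the `LogEvalMetricsCallback` from the comment above: instead of calling `trainer.save_metrics` (which overwrites `eval_results.json` on every call), it appends each evaluation's metrics to a JSONL file. The `eval_metrics.jsonl` filename and the `add_callback` wiring are illustrative assumptions, not part of the patch.

```python
import json
import os

from transformers import TrainerCallback


class SaveEvalMetricsCallback(TrainerCallback):
    """Append the metrics of every evaluation pass to a JSONL file,
    so intermediate evaluations are kept, not just the final one."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            # One JSON line per evaluation, keyed by the current global step.
            path = os.path.join(args.output_dir, "eval_metrics.jsonl")  # illustrative filename
            with open(path, "a") as f:
                f.write(json.dumps({"step": state.global_step, **metrics}) + "\n")


# In dpo.py, `trainer` is the already-constructed DPOTrainer instance;
# register the callback before training so every evaluation is recorded:
#
#     trainer.add_callback(SaveEvalMetricsCallback())
#     trainer.train()
```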