From 2555694bc14def9c06b0484266abf76571e67af2 Mon Sep 17 00:00:00 2001
From: cam1llynha
Date: Fri, 24 Oct 2025 13:58:36 -0300
Subject: [PATCH] Add tip for logging evaluation metrics during regular evaluations

This PR adds a comment to the DPO script explaining how to log and save
evaluation metrics during regular evaluations (not only the final one) using a
custom callback. It also clarifies how W&B aggregates evaluation metrics.
Related to issue #2602.

Checklist:
- [x] Added a clear example for the custom callback
- [x] Clarified W&B aggregation behavior
- [x] No code logic changed, documentation tip only
---
 trl/scripts/dpo.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/trl/scripts/dpo.py b/trl/scripts/dpo.py
index b1c4c7608bf..5ef3e39e70d 100644
--- a/trl/scripts/dpo.py
+++ b/trl/scripts/dpo.py
@@ -157,6 +157,21 @@ def main(script_args, training_args, model_args, dataset_args):
         metrics = trainer.evaluate()
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
+        # 💡 Tip: To log and save evaluation metrics during regular evaluations (not only the final one),
+        # you can use a custom callback:
+        #
+        # from transformers import TrainerCallback
+        #
+        # class LogEvalMetricsCallback(TrainerCallback):
+        #     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
+        #         if metrics:
+        #             trainer.log_metrics("eval", metrics)
+        #             trainer.save_metrics("eval", metrics)
+        #
+        # trainer.add_callback(LogEvalMetricsCallback())
+        #
+        # Note: Metrics logged to Weights & Biases (W&B) are aggregated over the entire evaluation
+        # dataset, not per batch. For per-batch logging, use the `on_prediction_step` callback hook.
 
     # Save and push to Hub
     trainer.save_model(training_args.output_dir)
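
Below is a minimal, self-contained sketch of the pattern the tip describes. It is a variant of the `LogEvalMetricsCallback` from the comment above: instead of calling `trainer.save_metrics` (which overwrites `eval_results.json` on every call), it appends each evaluation's metrics to a JSONL file. The `eval_metrics.jsonl` filename and the `add_callback` wiring are illustrative assumptions, not part of the patch.

```python
import json
import os

from transformers import TrainerCallback


class SaveEvalMetricsCallback(TrainerCallback):
    """Append the metrics of every evaluation pass to a JSONL file,
    so intermediate evaluations are kept, not just the final one."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            # One JSON line per evaluation, keyed by the current global step.
            path = os.path.join(args.output_dir, "eval_metrics.jsonl")  # illustrative filename
            with open(path, "a") as f:
                f.write(json.dumps({"step": state.global_step, **metrics}) + "\n")


# In dpo.py, `trainer` is the already-constructed DPOTrainer instance;
# register the callback before training so every evaluation is recorded:
#
#     trainer.add_callback(SaveEvalMetricsCallback())
#     trainer.train()
```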