From b43bfa80a619fa07332385df4aa88509f43fa982 Mon Sep 17 00:00:00 2001 From: amstu2 <11323044+amstu2@users.noreply.github.com> Date: Wed, 24 Sep 2025 06:58:27 +0000 Subject: [PATCH 1/3] Fix deberta overflow error --- src/lighteval/metrics/imports/bert_scorer.py | 17 +++++++++++++++++ src/lighteval/metrics/metrics_sample.py | 6 +++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index b8025bf3f..b03632051 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -37,6 +37,16 @@ logger = logging.getLogger(__name__) +def validate_tokenizer_length(tokenizer: AutoTokenizer, override_length: int | None) -> int: + if override_length: + return override_length + if tokenizer.model_max_length == int(1e30): + logger.warning("Could not read max_model_length attribute for BERTScorer's tokenizer - defaulting to 512.") + return 512 + else: + return tokenizer.max_model_length + + def padding(arr, pad_token, dtype=torch.long): lens = torch.LongTensor([len(a) for a in arr]) max_len = lens.max().item() @@ -321,6 +331,7 @@ def __init__( lang=None, rescale_with_baseline=False, baseline_path=None, + tokenizer_max_len: int | None = None, ): """Initialize BERTScorer. @@ -343,6 +354,7 @@ def __init__( return_hash (bool): Return hash code of the setting. rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline. baseline_path (str): Customized baseline file. + tokenizer_max_len (int, optional): will override the tokenizer's max model length if set. """ assert lang is not None or model_type is not None, "Either lang or model_type should be specified" @@ -366,6 +378,7 @@ def __init__( # Model and tokenizer are lazily loaded in `score()`. 
self._tokenizer = None + self._tokenizer_len = tokenizer_max_len self._model = None self._idf_dict = None @@ -430,6 +443,10 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): if self._model is None: logger.info(f"Loading BERTScorer model `{self._model_type}`") self._tokenizer = AutoTokenizer.from_pretrained(self._model_type) + self._tokenizer.max_model_length = validate_tokenizer_length( + tokenizer=self._tokenizer, override_length=self._tokenizer_len + ) + self._model = AutoModel.from_pretrained(self._model_type) self._model.eval() self._model.to(self.device) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 25b4f68ff..37d4e40cb 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -643,7 +643,11 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str logger.warning("The first metric computation step might be a bit longer as we need to download the model.") # We only initialize on first compute self.bert_scorer = BERTScorer( - model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, num_layers=9 + model_type="microsoft/deberta-large-mnli", + lang="en", + rescale_with_baseline=True, + num_layers=9, + tokenizer_max_len=512, ) golds = as_list(golds) predictions = as_list(predictions) From be11df1643820c5cfabbed9a132b8b8217405b77 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 24 Sep 2025 13:40:06 +0200 Subject: [PATCH 2/3] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/lighteval/metrics/imports/bert_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index b03632051..0ca2cde64 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ 
b/src/lighteval/metrics/imports/bert_scorer.py @@ -44,7 +44,7 @@ def validate_tokenizer_length(tokenizer: AutoTokenizer, override_length: int | N logger.warning("Could not read max_model_length attribute for BERTScorer's tokenizer - defaulting to 512.") return 512 else: - return tokenizer.max_model_length + return tokenizer.model_max_length def padding(arr, pad_token, dtype=torch.long): From 6c60a3b39f2a1571f7ec494c071c42eeaade3491 Mon Sep 17 00:00:00 2001 From: amstu2 <11323044+amstu2@users.noreply.github.com> Date: Sat, 27 Sep 2025 10:59:16 +1000 Subject: [PATCH 3/3] Fix BERTscore incorrect tokenizer attribute --- src/lighteval/metrics/imports/bert_scorer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index 0ca2cde64..88a063c6a 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -443,10 +443,9 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): if self._model is None: logger.info(f"Loading BERTScorer model `{self._model_type}`") self._tokenizer = AutoTokenizer.from_pretrained(self._model_type) - self._tokenizer.max_model_length = validate_tokenizer_length( + self._tokenizer.model_max_length = validate_tokenizer_length( tokenizer=self._tokenizer, override_length=self._tokenizer_len ) - self._model = AutoModel.from_pretrained(self._model_type) self._model.eval() self._model.to(self.device)