From cd0b80ed7877cb3981759b5c84c249fedd5587f9 Mon Sep 17 00:00:00 2001
From: Indrajit Banerjee
Date: Thu, 30 Oct 2025 13:29:14 +0530
Subject: [PATCH] fix `pad_to_max_length` -> `padding` during transformers
 tokenizer encode

Reproduction steps:

transformers == 4.55.0
download alpaca dataset
run command

python -m QEfficient.cloud.finetune --device qaic --model_name \
    meta-llama/Llama-3.1-8B --tokenizer_name meta-llama/Llama-3.1-8B \
    --max_train_step 20 --output_dir \
    meta-llama/Llama-3.1-8B_alpaca_dataset_run --dataset alpaca_dataset \
    --run_validation False

Expected error:

TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected
keyword argument 'pad_to_max_length'

Signed-off-by: Indrajit Banerjee
---
 QEfficient/finetune/dataset/alpaca_dataset.py                 | 4 ++--
 .../finetune/dataset/custom_dataset/sample_dataset_preproc.py | 4 ++--
 QEfficient/finetune/dataset/grammar_dataset.py                | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/QEfficient/finetune/dataset/alpaca_dataset.py b/QEfficient/finetune/dataset/alpaca_dataset.py
index c6ddb6ce1..e99bb3595 100644
--- a/QEfficient/finetune/dataset/alpaca_dataset.py
+++ b/QEfficient/finetune/dataset/alpaca_dataset.py
@@ -59,9 +59,9 @@ def __getitem__(self, index):
             prompt = PROMPT_DICT["prompt_input"].format_map(ann)
         example = prompt + ann["output"]
         prompt = torch.tensor(
-            self.tokenizer.encode(prompt, max_length=self.context_length, pad_to_max_length=True), dtype=torch.int64
+            self.tokenizer.encode(prompt, max_length=self.context_length, padding=True), dtype=torch.int64
         )
-        example = self.tokenizer.encode(example, max_length=self.context_length, pad_to_max_length=True)
+        example = self.tokenizer.encode(example, max_length=self.context_length, padding=True)
         example.append(self.tokenizer.eos_token_id)
         example = torch.tensor(example, dtype=torch.int64)
         labels = copy.deepcopy(example)
diff --git a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
index 78db5674c..3ce8fc5bf 100644
--- a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
+++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
@@ -65,13 +65,13 @@ def tokenize_add_label(sample):
             tokenizer.bos_token + sample["input"],
             add_special_tokens=False,
             max_length=context_length,
-            pad_to_max_length=True,
+            padding=True,
         )
         label = tokenizer.encode(
             sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
             add_special_tokens=False,
             max_length=context_length,
-            pad_to_max_length=True,
+            padding=True,
         )
 
         sample = {
diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py
index e40c01e97..01970722a 100644
--- a/QEfficient/finetune/dataset/grammar_dataset.py
+++ b/QEfficient/finetune/dataset/grammar_dataset.py
@@ -48,13 +48,13 @@ def convert_to_features(self, example_batch):
             self.tokenizer.bos_token + prompt,
             add_special_tokens=False,
             max_length=self.context_length,
-            pad_to_max_length=True,
+            padding=True,
         )
         label_ids = self.tokenizer.encode(
             target_ + self.tokenizer.eos_token,
             add_special_tokens=False,
             max_length=self.context_length,
-            pad_to_max_length=True,
+            padding=True,
         )
 
         sample = {
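
Note for reviewers (not part of the commit): a minimal sketch of the keyword change in isolation, assuming a recent transformers release where `pad_to_max_length` has been removed from the encode path. The "gpt2" checkpoint and the sample string are placeholders, not taken from the repository.

    from transformers import AutoTokenizer

    # Placeholder checkpoint for illustration; any fast tokenizer behaves the same.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    # Old keyword, rejected by recent transformers releases:
    # tokenizer.encode("some text", max_length=16, pad_to_max_length=True)
    # -> TypeError: ... got an unexpected keyword argument 'pad_to_max_length'

    # Supported `padding` keyword used by this patch: padding=True pads to the
    # longest sequence in the batch, padding="max_length" pads up to max_length.
    print(tokenizer.encode("some text", max_length=16, padding=True, truncation=True))
    print(tokenizer.encode("some text", max_length=16, padding="max_length", truncation=True))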