From 3c6ff7cfe347938c885ea19cc10525f1120aaa2a Mon Sep 17 00:00:00 2001 From: jiyzhang Date: Mon, 9 Feb 2026 14:18:28 +0800 Subject: [PATCH] Refactor tokenizer usage in dataset_utils.py 1. Issue encountered: ``` File "/app/TensorRT-Model-Optimizer/examples/llm_ptq/hf_ptq.py", line 146, in make_calib_dataloader calib_dataloader = get_dataset_dataloader( ^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/modelopt/torch/utils/dataset_utils.py", line 217, in get_dataset_dataloader batch_encoded = tokenizer.batch_encode_plus( ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py", line 1291, in __getattr__ raise AttributeError(f"{self.__class__.__name__} has no attribute {key}") AttributeError: Qwen2Tokenizer has no attribute batch_encode_plus. Did you mean: '_encode_plus'? ``` 2. `batch_encode_plus` is deprecated; it is recommended to call `tokenizer(...)` directly instead. Signed-off-by: jiyzhang --- modelopt/torch/utils/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 16bff49c2..e154de36e 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -247,7 +247,7 @@ def get_dataset_dataloader( samples = get_dataset_samples(ds_name, num_sample) all_samples.extend(samples) - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( all_samples, return_tensors="pt", padding=True,